#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'wshu'
__version__ = '1.0'
"""
    ***********************************
    *  @filename : chartsetck.py
    *  @Author : wshu
    *  @CodeDate : 2020/4/15 21:07
    *  @Software : PyCharm
    ***********************************
"""

import re
import chardet

# Ignore meta tags embedded in JavaScript, e.g. this page: http://www.xx007.com/inc/print.js
META_RE = re.compile(r'<meta[a-zA-Z\-="\s/;]+charset="?([^">]+)', re.I)


def _clean_charset(value):
    """Strip the whitespace, quotes and slashes that may surround a charset token."""
    return value.strip(' \t\r\n"\'/')


def check(item):
    """Guess the character set of a fetched page.

    Three sources are tried in order, stopping at the first that yields a
    non-empty name:

    1. the ``charset=`` parameter of the Content-Type header value,
    2. a ``<meta ... charset=...>`` tag in the body (matched by ``META_RE``),
    3. ``chardet`` detection over the first 10, 30 and then 100 body lines.

    :param item: mapping with the optional keys ``'content_type'`` (the
        Content-Type header value) and ``'body'`` (the page content). A
        missing or empty key is treated as "not available".
    :return: the lower-cased charset name, or ``''`` when nothing could be
        determined. ``gb2312`` is reported as ``gb18030``.
    """
    charset = ''
    # .get() so that a response without a Content-Type header (or without a
    # body) does not raise KeyError.
    content_type = item.get('content_type')
    html = item.get('body')

    if content_type:
        header = content_type.lower()
        pos = header.find('charset=')
        if pos != -1:
            # Keep only this parameter's value and drop optional quoting or
            # padding, e.g. 'charset="utf-8"' or 'charset= utf-8 '.
            value = header[pos + len('charset='):].split(';')[0]
            charset = _clean_charset(value)

    if html and not charset:
        match = META_RE.search(html)
        if match:
            # The capture may carry trailing junk such as ' /' (from
            # '<meta charset=utf-8 />') or single quotes.
            charset = _clean_charset(match.group(1))

    if html and not charset:
        lines = html.split('\n')
        # Grow the sample only while the detector cannot commit to anything
        # more specific than ASCII.
        for limit in (10, 30, 100):
            charset = chardet.detect('\n'.join(lines[:limit]))['encoding']
            if charset and charset.lower() != 'ascii':
                break
            if limit >= len(lines):
                # The whole body has already been examined; a larger window
                # would feed the detector the same input again.
                break

    # chardet may report None for the encoding.
    charset = (charset or '').lower()
    if charset == 'gb2312':
        # gb18030 covers more characters and is backward compatible with gb2312.
        charset = 'gb18030'
    return charset


if __name__ == '__main__':
    # Command-line driver: fetch the URL given as the first argument and
    # print the charset that check() determines for it.
    # NOTE: urllib2 exists only on Python 2.
    import socket
    import sys
    import urllib2

    # Apply a default timeout (seconds) to sockets created from here on.
    socket.setdefaulttimeout(8)
    try:
        url = sys.argv[1]
    except IndexError:
        print('Usage: python charsetck.py http://www.knownsec.com/')
        # A missing argument is a usage error, so report failure to the caller.
        sys.exit(1)

    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    usock = urllib2.urlopen(req)
    try:
        item = {
            'body': usock.read(),
            # Always provide the key: check() looks it up even when the
            # server sent no Content-Type header.
            'content_type': usock.headers.get('content-type', ''),
        }
    finally:
        # Release the connection even if reading the response fails.
        usock.close()

    print(check(item))